{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## IJCAI urls\n",
    "\n",
    "IJCAI官网使用AJAX...直接另存为html最省事"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from lxml import etree\n",
    "from utils import listdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2015.html',\n",
       " 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2016.html',\n",
       " 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2017_.html',\n",
       " 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2018_.html',\n",
       " 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2019_.html']"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "html_files = []\n",
    "listdir(path = 'IJCAI_htmls', list_name = html_files, extension_names = ['.html'])\n",
    "html_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len: 649\n"
     ]
    }
   ],
   "source": [
    "# 2015\n",
    "html_path = 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2015.html'\n",
    "with open (html_path, 'rb') as f:\n",
    "    html_content = f.read()\n",
    "pdf_urls_dict_2015 = {}\n",
    "for node in etree.HTML(html_content).xpath('//h3[contains(text(), \"Main Track\")]/following-sibling::p'):\n",
    "    name = node.xpath('text()[1]')[0].split(' / ')[0].strip()\n",
    "    url = 'https://www.ijcai.org{}'\n",
    "    _url = node.xpath('a[contains(text(), \"PDF\")]/@href')[0].strip()\n",
    "    url = url.format(_url)\n",
    "    if url not in pdf_urls_dict_2015.values(): pdf_urls_dict_2015[name] = url\n",
    "print (f'len: {len(pdf_urls_dict_2015.items())}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len: 651\n"
     ]
    }
   ],
   "source": [
    "# 2016\n",
    "html_path = 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2016.html'\n",
    "with open (html_path, 'rb') as f:\n",
    "    html_content = f.read()\n",
    "pdf_urls_dict_2016 = {}\n",
    "for node in etree.HTML(html_content).xpath('//a[contains(text(), \"PDF\")]/..'):\n",
    "    try: \n",
    "        page_num = node.xpath('string(.)').split('\\n')[0].split(' / ')[1].strip()\n",
    "        _ = int(page_num)\n",
    "    except ValueError:\n",
    "        continue\n",
    "    except IndexError:\n",
    "        print(repr(node.xpath('string(.)')))\n",
    "        break\n",
    "    \n",
    "    name = node.xpath('string(.)').split('\\n')[0].split(' / ')[0].strip()\n",
    "    url = 'https://www.ijcai.org{}'\n",
    "    _url = node.xpath('a[contains(text(), \"PDF\")]/@href')[0].strip()\n",
    "    url = url.format(_url)\n",
    "    if url not in pdf_urls_dict_2016.values(): pdf_urls_dict_2016[name] = url\n",
    "print (f'len: {len(pdf_urls_dict_2016.items())}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len: 777\n"
     ]
    }
   ],
   "source": [
    "# 2017\n",
    "html_path = 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2017_.html'\n",
    "with open (html_path, 'rb') as f:\n",
    "    html_content = f.read()\n",
    "pdf_urls_dict_2017 = {}\n",
    "for node in etree.HTML(html_content).xpath('//div[@class=\"paper_wrapper\"]'):\n",
    "    try: \n",
    "        _page_num = node.xpath('@id')[0][5:]\n",
    "        page_num = int(_page_num)\n",
    "        if page_num < 5: continue\n",
    "    except :\n",
    "        print('ERR:', repr(node.xpath('string(.)')))\n",
    "        break\n",
    "    \n",
    "    name = node.xpath('div[@class=\"title\"]/text()')[0].strip()\n",
    "    url = 'https://www.ijcai.org/Proceedings/2017/{}'\n",
    "    _url = node.xpath('.//a[contains(text(), \"PDF\")]/@href')[0].strip()\n",
    "    url = url.format(_url)\n",
    "    if url not in pdf_urls_dict_2017.values(): pdf_urls_dict_2017[name] = url\n",
    "print (f'len: {len(pdf_urls_dict_2017.items())}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len: 867\n"
     ]
    }
   ],
   "source": [
    "# 2018\n",
    "html_path = 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2018_.html'\n",
    "with open (html_path, 'rb') as f:\n",
    "    html_content = f.read()\n",
    "pdf_urls_dict_2018 = {}\n",
    "for node in etree.HTML(html_content).xpath('//div[@class=\"paper_wrapper\"]'):\n",
    "    try: \n",
    "        _page_num = node.xpath('@id')[0][5:]\n",
    "        page_num = int(_page_num)\n",
    "        if page_num < 4: continue\n",
    "    except :\n",
    "        print('ERR:', repr(node.xpath('string(.)')))\n",
    "        break\n",
    "    \n",
    "    name = node.xpath('div[@class=\"title\"]/text()')[0].strip()\n",
    "    url = 'https://www.ijcai.org/Proceedings/2018/{}'\n",
    "    _url = node.xpath('.//a[contains(text(), \"PDF\")]/@href')[0].strip()\n",
    "    url = url.format(_url)\n",
    "    if url not in pdf_urls_dict_2018.values(): pdf_urls_dict_2018[name] = url\n",
    "print (f'len: {len(pdf_urls_dict_2018.items())}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len: 964\n"
     ]
    }
   ],
   "source": [
    "# 2019\n",
    "html_path = 'IJCAI_htmls\\\\view-source_https___www.ijcai.org_Proceedings_2019_.html'\n",
    "with open (html_path, 'rb') as f:\n",
    "    html_content = f.read()\n",
    "pdf_urls_dict_2019 = {}\n",
    "for node in etree.HTML(html_content).xpath('//div[@class=\"paper_wrapper\"]'):\n",
    "    name = node.xpath('div[@class=\"title\"]/text()')[0].strip()\n",
    "    url = 'https://www.ijcai.org/Proceedings/2019/{}'\n",
    "    _url = node.xpath('.//a[contains(text(), \"PDF\")]/@href')[0].strip()\n",
    "    url = url.format(_url)\n",
    "    if url not in pdf_urls_dict_2019.values(): pdf_urls_dict_2019[name] = url\n",
    "print (f'len: {len(pdf_urls_dict_2019.items())}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "IJCAI DONE!\n"
     ]
    }
   ],
   "source": [
    "res_dict = {}\n",
    "for year, res_dic in zip(range(2015, 2020), \\\n",
    "                         [pdf_urls_dict_2015, pdf_urls_dict_2016, pdf_urls_dict_2017, \\\n",
    "                          pdf_urls_dict_2018, pdf_urls_dict_2019]):\n",
    "    res_dict[f'IJCAI_{year}'] = res_dic\n",
    "    \n",
    "_years = '_'.join([str(i) for i in range(2015, 2020)])\n",
    "with open(f'url_json/IJCAI_{_years}.json', 'w') as f:\n",
    "    json.dump(res_dict, f)\n",
    "print('IJCAI DONE!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
